In [1]:
library(DESeq2)
library(ggplot2)
library(reshape2)
library(patchwork)
library(ggrepel)
library(cowplot)
library(grid)
library(RColorBrewer)
library(repr) 
library(glmpca)
library(pheatmap)
library(PoiClaClu)
library(apeglm)
library(ashr)
library(vsn)
library(dplyr)
library(tidyr)
library(viridis)  
library("pheatmap")
library("ReportingTools")
library("BiocParallel")
library(glmpca)
library(emdbook)  
library(tidyverse)
register(MulticoreParam(4))
library(sva)
library(RUVSeq)
library(GenomicRanges)
library(ChIPseeker)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(org.Hs.eg.db)
Loading required package: S4Vectors

Loading required package: stats4

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, saveRDS, setdiff,
    table, tapply, union, unique, unsplit, which.max, which.min



Attaching package: ‘S4Vectors’


The following object is masked from ‘package:utils’:

    findMatches


The following objects are masked from ‘package:base’:

    expand.grid, I, unname


Loading required package: IRanges

Loading required package: GenomicRanges

Loading required package: GenomeInfoDb

Loading required package: SummarizedExperiment

Loading required package: MatrixGenerics

Loading required package: matrixStats


Attaching package: ‘MatrixGenerics’


The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
    rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
    rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
    rowWeightedSds, rowWeightedVars


Loading required package: Biobase

Welcome to Bioconductor

    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.



Attaching package: ‘Biobase’


The following object is masked from ‘package:MatrixGenerics’:

    rowMedians


The following objects are masked from ‘package:matrixStats’:

    anyMissing, rowMedians



Attaching package: ‘cowplot’


The following object is masked from ‘package:patchwork’:

    align_plots



Attaching package: ‘dplyr’


The following object is masked from ‘package:Biobase’:

    combine


The following object is masked from ‘package:matrixStats’:

    count


The following objects are masked from ‘package:GenomicRanges’:

    intersect, setdiff, union


The following object is masked from ‘package:GenomeInfoDb’:

    intersect


The following objects are masked from ‘package:IRanges’:

    collapse, desc, intersect, setdiff, slice, union


The following objects are masked from ‘package:S4Vectors’:

    first, intersect, rename, setdiff, setequal, union


The following objects are masked from ‘package:BiocGenerics’:

    combine, intersect, setdiff, union


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘tidyr’


The following object is masked from ‘package:reshape2’:

    smiths


The following object is masked from ‘package:S4Vectors’:

    expand


Loading required package: viridisLite

Loading required package: knitr



Registered S3 method overwritten by 'GGally':
  method from   
  +.gg   ggplot2



── Attaching core tidyverse packages ───────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.0     ✔ readr     2.1.5
✔ lubridate 1.9.4     ✔ stringr   1.5.1
✔ purrr     1.0.2     ✔ tibble    3.2.1
── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ lubridate::%within%() masks IRanges::%within%()
✖ dplyr::collapse()     masks IRanges::collapse()
✖ dplyr::combine()      masks Biobase::combine(), BiocGenerics::combine()
✖ dplyr::count()        masks matrixStats::count()
✖ dplyr::desc()         masks IRanges::desc()
✖ tidyr::expand()       masks S4Vectors::expand()
✖ dplyr::filter()       masks stats::filter()
✖ dplyr::first()        masks S4Vectors::first()
✖ dplyr::lag()          masks stats::lag()
✖ ggplot2::Position()   masks BiocGenerics::Position(), base::Position()
✖ purrr::reduce()       masks GenomicRanges::reduce(), IRanges::reduce()
✖ dplyr::rename()       masks S4Vectors::rename()
✖ lubridate::second()   masks S4Vectors::second()
✖ lubridate::second<-() masks S4Vectors::second<-()
✖ dplyr::slice()        masks IRanges::slice()
✖ lubridate::stamp()    masks cowplot::stamp()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Loading required package: mgcv

Loading required package: nlme


Attaching package: 'nlme'


The following object is masked from 'package:dplyr':

    collapse


The following object is masked from 'package:IRanges':

    collapse


This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.

Loading required package: genefilter


Attaching package: 'genefilter'


The following object is masked from 'package:readr':

    spec


The following object is masked from 'package:patchwork':

    area


The following objects are masked from 'package:MatrixGenerics':

    rowSds, rowVars


The following objects are masked from 'package:matrixStats':

    rowSds, rowVars


Loading required package: EDASeq

Loading required package: ShortRead

Loading required package: Biostrings

Loading required package: XVector


Attaching package: 'XVector'


The following object is masked from 'package:purrr':

    compact



Attaching package: 'Biostrings'


The following object is masked from 'package:grid':

    pattern


The following object is masked from 'package:base':

    strsplit


Loading required package: Rsamtools

Loading required package: GenomicAlignments


Attaching package: 'GenomicAlignments'


The following object is masked from 'package:dplyr':

    last



Attaching package: 'ShortRead'


The following object is masked from 'package:purrr':

    compose


The following object is masked from 'package:tibble':

    view


The following objects are masked from 'package:ReportingTools':

    basePath, name


The following object is masked from 'package:dplyr':

    id


Loading required package: edgeR

Loading required package: limma


Attaching package: 'limma'


The following object is masked from 'package:DESeq2':

    plotMA


The following object is masked from 'package:BiocGenerics':

    plotMA


ChIPseeker v1.42.1 Learn more at https://yulab-smu.top/contribution-knowledge-mining/

Please cite:

Qianwen Wang, Ming Li, Tianzhi Wu, Li Zhan, Lin Li, Meijun Chen, Wenqin
Xie, Zijing Xie, Erqiang Hu, Shuangbin Xu, Guangchuang Yu. Exploring
epigenomic datasets by ChIPseeker. Current Protocols. 2022, 2(10): e585

Loading required package: GenomicFeatures

Loading required package: AnnotationDbi


Attaching package: 'AnnotationDbi'


The following object is masked from 'package:dplyr':

    select




In [2]:
# Load count data
count_data <- read.csv("A549_peak_counts.txt", 
                       skip = 1, sep="\t", header=TRUE, stringsAsFactors=FALSE)  

head(count_data,2 )

# Remove the ".genomicAllAligned.sorted.bam" suffix from column names
colnames(count_data) <- sub("\\.CLAM.genomicAllAligned\\.sorted\\.bam$", "", colnames(count_data))
head(count_data, 2)
colnames(count_data)

colnames(count_data) <- sub("^([^\\.]+)\\..*", "\\1", colnames(count_data))
head(count_data, 2)
colnames(count_data, 2)
A data.frame: 2 × 18
GeneidChrStartEndStrandLengthAC1_IP.AC1_IP.CLAM.genomicAllAligned.sorted.bamAC1_NP.AC1_NP.CLAM.genomicAllAligned.sorted.bamAC2_IP.AC2_IP.CLAM.genomicAllAligned.sorted.bamAC2_NP.AC2_NP.CLAM.genomicAllAligned.sorted.bamAP1_IP.AP1_IP.CLAM.genomicAllAligned.sorted.bamAP1_NP.AP1_NP.CLAM.genomicAllAligned.sorted.bamAP2_IP.AP2_IP.CLAM.genomicAllAligned.sorted.bamAP2_NP.AP2_NP.CLAM.genomicAllAligned.sorted.bamAV1_IP.AV1_IP.CLAM.genomicAllAligned.sorted.bamAV1_NP.AV1_NP.CLAM.genomicAllAligned.sorted.bamAV2_IP.AV2_IP.CLAM.genomicAllAligned.sorted.bamAV2_NP.AV2_NP.CLAM.genomicAllAligned.sorted.bam
<chr><chr><chr><chr><chr><int><int><int><int><int><int><int><int><int><int><int><int><int>
1ENSG00000290825chr1;chr1;chr1;chr114621;14721;19121;1922114721;14821;19221;19321+;+;+;+402310200002100
2ENSG00000310526chr1;chr1;chr1;chr114656;14756;19156;1925614756;14856;19256;19356-;-;-;-402000230000100
A data.frame: 2 × 18
GeneidChrStartEndStrandLengthAC1_IP.AC1_IPAC1_NP.AC1_NPAC2_IP.AC2_IPAC2_NP.AC2_NPAP1_IP.AP1_IPAP1_NP.AP1_NPAP2_IP.AP2_IPAP2_NP.AP2_NPAV1_IP.AV1_IPAV1_NP.AV1_NPAV2_IP.AV2_IPAV2_NP.AV2_NP
<chr><chr><chr><chr><chr><int><int><int><int><int><int><int><int><int><int><int><int><int>
1ENSG00000290825chr1;chr1;chr1;chr114621;14721;19121;1922114721;14821;19221;19321+;+;+;+402310200002100
2ENSG00000310526chr1;chr1;chr1;chr114656;14756;19156;1925614756;14856;19256;19356-;-;-;-402000230000100
  1. 'Geneid'
  2. 'Chr'
  3. 'Start'
  4. 'End'
  5. 'Strand'
  6. 'Length'
  7. 'AC1_IP.AC1_IP'
  8. 'AC1_NP.AC1_NP'
  9. 'AC2_IP.AC2_IP'
  10. 'AC2_NP.AC2_NP'
  11. 'AP1_IP.AP1_IP'
  12. 'AP1_NP.AP1_NP'
  13. 'AP2_IP.AP2_IP'
  14. 'AP2_NP.AP2_NP'
  15. 'AV1_IP.AV1_IP'
  16. 'AV1_NP.AV1_NP'
  17. 'AV2_IP.AV2_IP'
  18. 'AV2_NP.AV2_NP'
A data.frame: 2 × 18
GeneidChrStartEndStrandLengthAC1_IPAC1_NPAC2_IPAC2_NPAP1_IPAP1_NPAP2_IPAP2_NPAV1_IPAV1_NPAV2_IPAV2_NP
<chr><chr><chr><chr><chr><int><int><int><int><int><int><int><int><int><int><int><int><int>
1ENSG00000290825chr1;chr1;chr1;chr114621;14721;19121;1922114721;14821;19221;19321+;+;+;+402310200002100
2ENSG00000310526chr1;chr1;chr1;chr114656;14756;19156;1925614756;14856;19256;19356-;-;-;-402000230000100
  1. 'Geneid'
  2. 'Chr'
  3. 'Start'
  4. 'End'
  5. 'Strand'
  6. 'Length'
  7. 'AC1_IP'
  8. 'AC1_NP'
  9. 'AC2_IP'
  10. 'AC2_NP'
  11. 'AP1_IP'
  12. 'AP1_NP'
  13. 'AP2_IP'
  14. 'AP2_NP'
  15. 'AV1_IP'
  16. 'AV1_NP'
  17. 'AV2_IP'
  18. 'AV2_NP'
In [3]:
count_data2 <- count_data %>%
  mutate(
    Chr = sapply(strsplit(Chr, ";"), `[`, 1),
    Start = sapply(strsplit(Start, ";"), function(x) min(as.integer(x))),
    End = sapply(strsplit(End, ";"), function(x) max(as.integer(x)))
  )

colnames(count_data2)
head(count_data2, 2)
  1. 'Geneid'
  2. 'Chr'
  3. 'Start'
  4. 'End'
  5. 'Strand'
  6. 'Length'
  7. 'AC1_IP'
  8. 'AC1_NP'
  9. 'AC2_IP'
  10. 'AC2_NP'
  11. 'AP1_IP'
  12. 'AP1_NP'
  13. 'AP2_IP'
  14. 'AP2_NP'
  15. 'AV1_IP'
  16. 'AV1_NP'
  17. 'AV2_IP'
  18. 'AV2_NP'
A data.frame: 2 × 18
GeneidChrStartEndStrandLengthAC1_IPAC1_NPAC2_IPAC2_NPAP1_IPAP1_NPAP2_IPAP2_NPAV1_IPAV1_NPAV2_IPAV2_NP
<chr><chr><int><int><chr><int><int><int><int><int><int><int><int><int><int><int><int><int>
1ENSG00000290825chr11462119321+;+;+;+402310200002100
2ENSG00000310526chr11465619356-;-;-;-402000230000100
In [4]:
colnames(count_data2)
head(count_data2, 18)
  1. 'Geneid'
  2. 'Chr'
  3. 'Start'
  4. 'End'
  5. 'Strand'
  6. 'Length'
  7. 'AC1_IP'
  8. 'AC1_NP'
  9. 'AC2_IP'
  10. 'AC2_NP'
  11. 'AP1_IP'
  12. 'AP1_NP'
  13. 'AP2_IP'
  14. 'AP2_NP'
  15. 'AV1_IP'
  16. 'AV1_NP'
  17. 'AV2_IP'
  18. 'AV2_NP'
A data.frame: 18 × 18
GeneidChrStartEndStrandLengthAC1_IPAC1_NPAC2_IPAC2_NPAP1_IPAP1_NPAP2_IPAP2_NPAV1_IPAV1_NPAV2_IPAV2_NP
<chr><chr><int><int><chr><int><int><int><int><int><int><int><int><int><int><int><int><int>
1ENSG00000290825chr1 14621 19321+;+;+;+ 402 3 1 0 2 0 0 0 0 2 1 0 0
2ENSG00000310526chr1 14656 19356-;-;-;- 402 0 0 0 2 3 0 0 0 0 1 0 0
3ENSG00000227232chr1 14696 19296-;- 202 0 0 0 0 0 0 0 0 0 0 0 0
4ENSG00000310528chr1 189386 195386-;-;- 303 15 11 28 21 21 5 21 7 2 17 5 15
5ENSG00000310527chr1 187146 195346-;-;-;-;-;-;- 707 79 38 86 46 124 16 112 13 13 55 14 38
6ENSG00000293331chr1 629954 632454-;-;-;-;-;- 606 0 0 0 0 0 0 0 0 0 1 0 0
7ENSG00000225880chr1 629745 634745-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-2011 916 3441057237 7751611236123 312 971 2901315
8ENSG00000225630chr1 630340 630440+ 101 0 0 0 0 0 0 0 0 1 0 0 0
9ENSG00000237973chr1 631274 632474+;+;+ 303 23 8 16 9 19 5 25 1 2 24 5 21
10ENSG00000278791chr1 632325 632425- 101 0 0 1 0 0 0 0 0 2 0 0 1
11ENSG00000248527chr1 633696 634196+;+ 202400413134455827374687440915091094392013472164
12ENSG00000198744chr1 634476 634676+;+ 201 0 0 0 0 0 0 0 0 0 0 0 0
13ENSG00000237491chr1 790239 790339+ 101 3 10 12 5 33 4 1 5 5 14 6 10
14ENSG00000228794chr1 827038 856238+;+;+;+ 404 53 112 59107 65 48 53 29 17 38 14 23
15ENSG00000188290chr1 9999621000062- 101 64 11 86 21 66 5 91 2 8 53 10 11
16ENSG00000187608chr110142381014338+ 101 170 12 232 17 37 5 48 3 10 148 2 76
17ENSG00000242590chr110554331055533+ 101 450 17 709 22 536 14 865 4 51 278 64 141
18ENSG00000131591chr110819181082918-;-;-;-;-;-;-;-;-;- 1001 121 84 162149 121 64 250 30 22 79 38 35
In [5]:
print("number of peaks:")
dim(count_data2)[1]

# Convert to GRanges object
gr <- GRanges(
  seqnames = count_data2$Chr,
  ranges = IRanges(start = count_data2$Start, end = count_data2$End)
)

# Merge overlapping intervals
gr_merge <- GenomicRanges::reduce(gr)

# Convert back to data frame
df_merge <- as.data.frame(gr_merge)[, c("seqnames", "start", "end")]
colnames(df_merge) <- c("Chr", "Start", "End")

# Print or return merged peaks
dim(df_merge)[1]

# Annotate merged peaks to genome features
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene

peak_anno <- annotatePeak(gr_merge, 
                          TxDb = txdb, 
                          tssRegion = c(-3000, 3000), 
                          annoDb = "org.Hs.eg.db")

# 📊 Plot annotation breakdown 
print("the annotation of the peaks:")
plotAnnoPie(peak_anno)
[1] "number of peaks:"
11828
8747
>> preparing features information...		 2025-04-18 04:01:03 PM 
>> identifying nearest features...		 2025-04-18 04:01:05 PM 
>> calculating distance from peak to TSS...	 2025-04-18 04:01:05 PM 
>> assigning genomic annotation...		 2025-04-18 04:01:05 PM 
>> adding gene annotation...			 2025-04-18 04:01:25 PM 
'select()' returned 1:many mapping between keys and columns

>> assigning chromosome lengths			 2025-04-18 04:01:25 PM 
>> done...					 2025-04-18 04:01:25 PM 
[1] "the annotation of the peaks:"
No description has been provided for this image
In [ ]:

In [6]:
# Select _IP columns + Geneid
ip <- count_data2 %>% dplyr::select(Geneid, ends_with("_IP"))
head(ip, 2)

# Select _NP columns + Geneid
np <- count_data2 %>% dplyr::select(Geneid, ends_with("_NP"))
head(np, 2)
A data.frame: 2 × 7
GeneidAC1_IPAC2_IPAP1_IPAP2_IPAV1_IPAV2_IP
<chr><int><int><int><int><int><int>
1ENSG00000290825300020
2ENSG00000310526003000
A data.frame: 2 × 7
GeneidAC1_NPAC2_NPAP1_NPAP2_NPAV1_NPAV2_NP
<chr><int><int><int><int><int><int>
1ENSG00000290825120010
2ENSG00000310526020010
In [ ]:

In [7]:
print("DESeq2 to measure the differential binding across IP samples :")
[1] "DESeq2 to measure the differential binding across IP samples :"
In [8]:
# Step 1: Select only columns ending with "_IP" and include Geneid for rownames
ip <- count_data2 %>%
     dplyr::select(Geneid, ends_with("_IP"))

# Step 2: Set Geneid as rownames and remove Geneid column
rownames(ip) <- ip$Geneid
ip <- ip[, -1]

# Step 3: Verify column names
sample_names <- colnames(ip)
print(sample_names)

# Step 4: Assign condition groups (AC, AP, AV)
conditions <- ifelse(grepl("^AC", sample_names), "AC",
              ifelse(grepl("^AP", sample_names), "AP",
              ifelse(grepl("^AV", sample_names), "AV", NA)))
print(conditions)

# Step 5: Error handling if any unrecognized sample names
if (any(is.na(conditions))) {
  stop("Some sample names do not match expected patterns (AC, AP, AV). Check column names!")
}

# Step 6: Create colData
col_data <- data.frame(
  row.names = sample_names,
  condition = factor(conditions, levels = c("AC", "AP", "AV"))
)
print("col data:")
print(col_data)

# Step 7: Remove rows with NA values
dim(ip)
ip <- ip[complete.cases(ip), ]
dim(ip)

# Step 8: Compute summary statistics directly on ip (only numeric columns)
summary_stats <- data.frame(
  Median = apply(ip, 2, median, na.rm = TRUE),
  Min    = apply(ip, 2, min, na.rm = TRUE),
  Max    = apply(ip, 2, max, na.rm = TRUE)
)

# Step 9: Print the result
print(summary_stats)
[1] "AC1_IP" "AC2_IP" "AP1_IP" "AP2_IP" "AV1_IP" "AV2_IP"
[1] "AC" "AC" "AP" "AP" "AV" "AV"
[1] "col data:"
       condition
AC1_IP        AC
AC2_IP        AC
AP1_IP        AP
AP2_IP        AP
AV1_IP        AV
AV2_IP        AV
  1. 11828
  2. 6
  1. 11828
  2. 6
       Median Min    Max
AC1_IP  179.0   0 588918
AC2_IP  240.5   0 606823
AP1_IP  187.0   0 504829
AP2_IP  218.0   0 691186
AV1_IP   55.0   0 151888
AV2_IP   60.0   0 157687
In [9]:
# Create DESeq2 dataset
dds <- DESeqDataSetFromMatrix(countData = ip, colData = col_data, design = ~condition)

# Set reference level for condition (AV will be the baseline)
dds$condition <- relevel(dds$condition, ref = "AV")

# Print number of genes before filtering
cat("Number of genes before filtering:", nrow(dds), "\n")

# Estimate size factors (required for normalization)
dds <- estimateSizeFactors(dds)

# Filter: keep genes with normalized count >= 4 in at least 4 samples
keep <- rowSums(counts(dds, normalized = TRUE) >= 4) >= 4
dds <- dds[keep, ]

# Print number of genes after filtering
cat("Number of genes after filtering:", nrow(dds), "\n")

# Run DESeq2 differential expression analysis
dds <- DESeq(dds)

# Extract results table
# res <- results(dds) 
# it will produce : Wald test p-value: condition AC vs AV 

# View summary of results
# cat("First row of DE results:\n")
# print(head(res, 1))
# cat("Last row of DE results:\n")
# print(tail(res, 1))
# cat("Summary of DESeq2 results:\n")
# print(summary(res))

# Show contrast names
cat("Available result contrasts:\n")
print(resultsNames(dds))

# Size factors (re-estimation here is harmless but redundant)
cat("The size factors are:\n")
print(sizeFactors(dds))

# Extract normalized counts
norm_counts <- counts(dds, normalized = TRUE)

# Preview normalized counts
cat("Preview of normalized counts:\n")
print(head(norm_counts, 2))

# Save normalized counts to CSV
write.csv(norm_counts, "A549.peaks.IP.samples.normalized.counts.csv", row.names = TRUE)

# Compute summary statistics for each sample (column)
summary_stats2 <- data.frame(
  Median = apply(norm_counts, 2, median, na.rm = TRUE),
  Min    = apply(norm_counts, 2, min, na.rm = TRUE),
  Max    = apply(norm_counts, 2, max, na.rm = TRUE)
)

# Print summary statistics
cat("Summary of the normalized counts:\n")
print(round(summary_stats2, 2))

# Show available assays in the DESeqDataSet
cat("Available assays in dds:\n")
print(names(assays(dds)))

# View fitted means (mu) and Cook's distances
cat("DESeq2 fitted means (mu):\n")
print(head(assay(dds, "mu"), 2))

cat("DESeq2 Cook's distances (cooks):\n")
print(head(assay(dds, "cooks"), 2))
Number of genes before filtering: 11828 
Number of genes after filtering: 10263 
using pre-existing size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

Available result contrasts:
[1] "Intercept"          "condition_AC_vs_AV" "condition_AP_vs_AV"
The size factors are:
   AC1_IP    AC2_IP    AP1_IP    AP2_IP    AV1_IP    AV2_IP 
1.3569838 1.7806372 1.4003870 1.6497101 0.4226442 0.4513182 
Preview of normalized counts:
                  AC1_IP   AC2_IP   AP1_IP   AP2_IP    AV1_IP   AV2_IP
ENSG00000310528 11.05393 15.72471 14.99586 12.72951  4.732113 11.07866
ENSG00000310527 58.21735 48.29732 88.54695 67.89072 30.758732 31.02024
Summary of the normalized counts:
       Median Min      Max
AC1_IP 185.71   0 433990.5
AC2_IP 187.01   0 340789.8
AP1_IP 182.09   0 360492.5
AP2_IP 186.70   0 418974.2
AV1_IP 182.19   0 359375.6
AV2_IP 186.12   0 349392.1
Available assays in dds:
[1] "counts" "mu"     "H"      "cooks" 
DESeq2 fitted means (mu):
                  AC1_IP   AC2_IP    AP1_IP    AP2_IP   AV1_IP    AV2_IP
ENSG00000310528 18.24995 23.94762  19.38885  22.84082  3.36674  3.595154
ENSG00000310527 72.11762 94.63291 109.39270 128.86884 13.05625 13.942046
DESeq2 Cook's distances (cooks):
                    AC1_IP     AC2_IP     AP1_IP     AP2_IP       AV1_IP
ENSG00000310528 0.17580020 0.20951993 0.04063713 0.04515130 2.934290e-01
ENSG00000310527 0.02695118 0.02864283 0.05604267 0.05755543 4.290547e-05
                      AV2_IP
ENSG00000310528 3.220225e-01
ENSG00000310527 4.529699e-05
In [10]:
# Extract raw (unnormalized) counts
print("raw counts")
raw_counts <- counts(dds, normalized = FALSE)
head(raw_counts, 2)

# Extract normalized counts
print("norm counts")
norm_counts <- counts(dds, normalized = TRUE)
head(norm_counts, 2)
[1] "raw counts"
A matrix: 2 × 6 of type int
AC1_IPAC2_IPAP1_IPAP2_IPAV1_IPAV2_IP
ENSG000003105281528 21 21 2 5
ENSG0000031052779861241121314
[1] "norm counts"
A matrix: 2 × 6 of type dbl
AC1_IPAC2_IPAP1_IPAP2_IPAV1_IPAV2_IP
ENSG0000031052811.0539315.7247114.9958612.72951 4.73211311.07866
ENSG0000031052758.2173548.2973288.5469567.8907230.75873231.02024
In [11]:
print("Boxplot of Raw vs log2 Normalized Counts")

# Prepare raw counts
raw_counts <- as.data.frame(counts(dds, normalized = FALSE))
raw_counts$Gene <- rownames(raw_counts)
raw_long <- pivot_longer(raw_counts, -Gene, names_to = "Sample", values_to = "Count")
raw_long$log2_count <- log2(raw_long$Count + 1)

# Prepare normalized counts
norm_counts <- as.data.frame(counts(dds, normalized = TRUE))
norm_counts$Gene <- rownames(norm_counts)
norm_long <- pivot_longer(norm_counts, -Gene, names_to = "Sample", values_to = "Count")
norm_long$log2_count <- log2(norm_long$Count + 1)

# Color palette
sample_list <- unique(c(raw_long$Sample, norm_long$Sample))
sample_colors <- setNames(viridis::viridis(length(sample_list), option = "D"), sample_list)

# Plot p1: Raw counts
p1 <- ggplot(raw_long, aes(x = Sample, y = log2_count, fill = Sample)) +
  geom_boxplot(outlier.size = 0.5, width = 0.7) +
  scale_fill_manual(values = sample_colors, name = "Sample") +
  theme_minimal(base_size = 12) +
  labs(title = "Raw Counts (log2 scale)",
       y = "log2(Counts + 1)", x = "Sample") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Plot p2: Normalized counts
p2 <- ggplot(norm_long, aes(x = Sample, y = log2_count, fill = Sample)) +
  geom_boxplot(outlier.size = 0.5, width = 0.7) +
  scale_fill_manual(values = sample_colors, name = "Sample") +
  theme_minimal(base_size = 12) +
  labs(title = "Normalized Counts (log2 scale)",
       y = "log2(Counts + 1)", x = "Sample") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Combine side-by-side
options(repr.plot.width = 12, repr.plot.height = 6)

# Combine side-by-side
plot_grid(p1, p2, labels = c("A", "B"), ncol = 2, align = 'h')
[1] "Boxplot of Raw vs log2 Normalized Counts"
No description has been provided for this image
In [12]:
# Extract raw counts and log-transform
raw_counts <- counts(dds, normalized = FALSE)
raw_log_counts <- log10(raw_counts + 1)

# Convert to long format for ggplot2
log1_df <- as.data.frame(raw_log_counts)
log1_df$Gene <- rownames(log1_df)
log1_long <- pivot_longer(log1_df, -Gene, names_to = "Sample", values_to = "log10_count")

# Extract normalized counts and log-transform
norm_counts <- counts(dds, normalized = TRUE)
norm_log_counts <- log10(norm_counts + 1)

# Convert to long format for ggplot2
log2_df <- as.data.frame(norm_log_counts)
log2_df$Gene <- rownames(log2_df)
log2_long <- pivot_longer(log2_df, -Gene, names_to = "Sample", values_to = "log10_count")

# Plot with ggplot2
p1 = ggplot(log1_long, aes(x = log10_count, color = Sample)) +
  geom_density(size = 1.2, alpha = 0.8) +
  theme_minimal(base_size = 14) +
  labs(
    title = "Density Plot of log10 Raw Counts (Before Normalization)",
    x = "log10(Counts + 1)",
    y = "Density"
  ) +
  theme(
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10),
    legend.position = "right"
  )
# Plot with ggplot2
p2 = ggplot(log2_long, aes(x = log10_count, color = Sample)) +
  geom_density(size = 1.2, alpha = 0.8) +
  theme_minimal(base_size = 14) +
  labs(
    title = "Density Plot of log10 Normalized Counts",
    x = "log10(Normalized Counts + 1)",
    y = "Density"
  ) +
  theme(
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10),
    legend.position = "right"
  )

# Combine side-by-side
options(repr.plot.width = 14, repr.plot.height = 6)

# Combine side-by-side
plot_grid(p1, p2, labels = c("A", "B"), ncol = 2, align = 'h')
Warning message:
"Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead."
No description has been provided for this image
In [13]:
print("printing the results")
resultsNames(dds)
[1] "printing the results"
  1. 'Intercept'
  2. 'condition_AC_vs_AV'
  3. 'condition_AP_vs_AV'
In [14]:
print("No shrinkage")

# Get results for different comparisons
res_AP_vs_AC <- results(dds, contrast = c("condition", "AP", "AC"))
res_AP_vs_AV <- results(dds, contrast = c("condition", "AP", "AV"))
res_AC_vs_AV <- results(dds, contrast = c("condition", "AC", "AV"))

# summary(res_AP_vs_AC)
# summary(res_AC_vs_AV)
# summary(res_AP_vs_AV)

# Save results
write.csv(as.data.frame(res_AP_vs_AC), file = "A549.peaks.IP.samples.DESeq2_AP_vs_AC_results.csv")
write.csv(as.data.frame(res_AP_vs_AV), file = "A549.peaks.IP.samples.DESeq2_AP_vs_AV_results.csv")
write.csv(as.data.frame(res_AC_vs_AV), file = "A549.peaks.IP.samples.DESeq2_AC_vs_AV_results.csv")

###########################################################
###########################################################

print("number of differentially bound transcripts : AP vs AC : pvalue < 0.05 and padj < 0.1")
dim(subset(res_AP_vs_AC, pvalue < 0.05))[1]
dim(subset(res_AP_vs_AC, padj < 0.1))[1]

print("number of differentially bound transcripts : AP vs AV : pvalue < 0.05 and padj < 0.1")
dim(subset(res_AP_vs_AV, pvalue < 0.05))[1]
dim(subset(res_AP_vs_AV, padj < 0.1))[1]

print("number of differentially bound transcripts : AC vs AV : pvalue < 0.05 and padj < 0.1")
dim(subset(res_AC_vs_AV, pvalue < 0.05))[1]
dim(subset(res_AC_vs_AV, padj < 0.1))[1]

###########################################################
###########################################################
[1] "No shrinkage"
[1] "number of differentially bound transcripts : AP vs AC : pvalue < 0.05 and padj < 0.1"
754
136
[1] "number of differentially bound transcripts : AP vs AV : pvalue < 0.05 and padj < 0.1"
4235
4031
[1] "number of differentially bound transcripts : AC vs AV : pvalue < 0.05 and padj < 0.1"
4195
3952
In [15]:
# type = c("apeglm", "ashr", "normal")
In [16]:
print("Data shrinkage : normal lfcShrink")
[1] "Data shrinkage : normal lfcShrink"
In [17]:
# Get results for different comparisons

resLFCnormal_AP_vs_AV <- lfcShrink(dds, contrast = c("condition", "AP", "AV"), type="normal")
resLFCnormal_AC_vs_AV <- lfcShrink(dds, contrast = c("condition", "AC", "AV"), type="normal")
resLFCnormal_AP_vs_AC <- lfcShrink(dds, contrast = c("condition", "AP", "AC"), type="normal")

# summary(resLFCnormal_AP_vs_AC)
# summary(resLFCnormal_AC_vs_AV)
# summary(resLFCnormal_AP_vs_AV)

# Save results
write.csv(as.data.frame(resLFCnormal_AP_vs_AC), file = "A549.peaks.IP.samples.DESeq2_AP_vs_AC_results.resLFCnormal.csv")
write.csv(as.data.frame(resLFCnormal_AP_vs_AV), file = "A549.peaks.IP.samples.DESeq2_AP_vs_AV_results.resLFCnormal.csv")
write.csv(as.data.frame(resLFCnormal_AC_vs_AV), file = "A549.peaks.IP.samples.DESeq2_AC_vs_AV_results.resLFCnormal.csv")

###########################################################
###########################################################

print("number of differentially bound and expressed transcripts : resLFCnormal: AP vs AC : pvalue < 0.05 and padj < 0.1")
dim(subset(resLFCnormal_AP_vs_AC, pvalue < 0.05))
dim(subset(resLFCnormal_AP_vs_AC, padj < 0.1))

print("number of differentially bound and expressed transcripts : resLFCnormal : AP vs AV : pvalue < 0.05 and padj < 0.1")
dim(subset(resLFCnormal_AP_vs_AV, pvalue < 0.05))
dim(subset(resLFCnormal_AP_vs_AV, padj < 0.1))

print("number of differentially bound and expressed transcripts : resLFCashr : AC vs AV : pvalue < 0.05 and padj < 0.1")
dim(subset(resLFCnormal_AC_vs_AV, pvalue < 0.05))
dim(subset(resLFCnormal_AC_vs_AV, padj < 0.1))

###########################################################
###########################################################
using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).

Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
Reference: https://doi.org/10.1093/bioinformatics/bty895

using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).

Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
Reference: https://doi.org/10.1093/bioinformatics/bty895

using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).

Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
Reference: https://doi.org/10.1093/bioinformatics/bty895

[1] "number of differentially bound and expressed transcripts : resLFCnormal: AP vs AC : pvalue < 0.05 and padj < 0.1"
  1. 754
  2. 6
  1. 136
  2. 6
[1] "number of differentially bound and expressed transcripts : resLFCnormal : AP vs AV : pvalue < 0.05 and padj < 0.1"
  1. 4235
  2. 6
  1. 4031
  2. 6
[1] "number of differentially bound and expressed transcripts : resLFCashr : AC vs AV : pvalue < 0.05 and padj < 0.1"
  1. 4195
  2. 6
  1. 3952
  2. 6
In [18]:
print("Data shrinkage : ashr lfcShrink")
[1] "Data shrinkage : ashr lfcShrink"
In [19]:
# If you must use contrast, you should use type="normal" or type="ashr" instead of apeglm, 
# because apeglm only works with coef.  
# Apeglm is the recommended method for log-fold change shrinkage.

# Get results for different comparisons
# resLFCapeglm_AP_vs_AV <- lfcShrink(dds, coef = "condition_AP_vs_AV", type="apeglm")
# resLFCapeglm_AC_vs_AV <- lfcShrink(dds, coef = "condition_AC_vs_AV", type="apeglm")

resLFCashr_AP_vs_AV <- lfcShrink(dds, contrast = c("condition", "AP", "AV"), type="ashr")
resLFCashr_AC_vs_AV <- lfcShrink(dds, contrast = c("condition", "AC", "AV"), type="ashr")
resLFCashr_AP_vs_AC <- lfcShrink(dds, contrast = c("condition", "AP", "AC"), type="ashr")

# summary(resLFCashr_AP_vs_AC)
# summary(resLFCashr_AC_vs_AV)
# summary(resLFCashr_AP_vs_AV)

# Save results
write.csv(as.data.frame(resLFCashr_AP_vs_AC), file = "A549.peaks.IP.samples.DESeq2_AP_vs_AC_results.resLFCashr.csv")
write.csv(as.data.frame(resLFCashr_AP_vs_AV), file = "A549.peaks.IP.samples.DESeq2_AP_vs_AV_results.resLFCashr.csv")
write.csv(as.data.frame(resLFCashr_AC_vs_AV), file = "A549.peaks.IP.samples.DESeq2_AC_vs_AV_results.resLFCashr.csv")

###########################################################
###########################################################

print("number of differentially bound and expressed transcripts : resLFCashr : AP vs AC : pvalue < 0.05 and padj < 1")
dim(subset(resLFCashr_AP_vs_AC, pvalue < 0.05))[1]
dim(subset(resLFCashr_AP_vs_AC, padj < 0.1))[1]

print("number of differentially bound and expressed transcripts : resLFCashr : AP vs AV : pvalue < 0.05 ")
dim(subset(resLFCashr_AP_vs_AV, pvalue < 0.05))[1]
dim(subset(resLFCashr_AP_vs_AV, padj < 0.1))[1]

print("number of differentially bound and expressed transcripts : resLFCashr : AC vs AV : pvalue < 0.05")
dim(subset(resLFCashr_AC_vs_AV, pvalue < 0.05))[1]
dim(subset(resLFCashr_AC_vs_AV, padj < 0.1))[1]

###########################################################
###########################################################
using 'ashr' for LFC shrinkage. If used in published research, please cite:
    Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
    https://doi.org/10.1093/biostatistics/kxw041

using 'ashr' for LFC shrinkage. If used in published research, please cite:
    Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
    https://doi.org/10.1093/biostatistics/kxw041

using 'ashr' for LFC shrinkage. If used in published research, please cite:
    Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
    https://doi.org/10.1093/biostatistics/kxw041

[1] "number of differentially bound and expressed transcripts : resLFCashr : AP vs AC : pvalue < 0.05 and padj < 1"
754
136
[1] "number of differentially bound and expressed transcripts : resLFCashr : AP vs AV : pvalue < 0.05 "
4235
4031
[1] "number of differentially bound and expressed transcripts : resLFCashr : AC vs AV : pvalue < 0.05"
4195
3952
In [20]:
# If you must use contrast, you should use type="normal" or type="ashr" instead of apeglm, 
# because apeglm only works with coef.  
# Apeglm is the recommended method for log-fold change shrinkage.

# Get results for different comparisons
# resLFCapeglm_AP_vs_AV <- lfcShrink(dds, coef = "condition_AP_vs_AV", type="apeglm")
# resLFCapeglm_AC_vs_AV <- lfcShrink(dds, coef = "condition_AC_vs_AV", type="apeglm")

resLFCashr_AP_vs_AV <- lfcShrink(dds, contrast = c("condition", "AP", "AV"), type="ashr")
resLFCashr_AC_vs_AV <- lfcShrink(dds, contrast = c("condition", "AC", "AV"), type="ashr")
resLFCashr_AP_vs_AC <- lfcShrink(dds, contrast = c("condition", "AP", "AC"), type="ashr")

# summary(resLFCashr_AP_vs_AC)
# summary(resLFCashr_AC_vs_AV)
# summary(resLFCashr_AP_vs_AV)

# Save results
write.csv(as.data.frame(resLFCashr_AP_vs_AC), file = "A549.peaks.IP.samples.DESeq2_AP_vs_AC_results.resLFCashr.csv")
write.csv(as.data.frame(resLFCashr_AP_vs_AV), file = "A549.peaks.IP.samples.DESeq2_AP_vs_AV_results.resLFCashr.csv")
write.csv(as.data.frame(resLFCashr_AC_vs_AV), file = "A549.peaks.IP.samples.DESeq2_AC_vs_AV_results.resLFCashr.csv")

###########################################################
###########################################################

print("number of differentially bound and expressed transcripts : resLFCashr : AP vs AC : pvalue < 0.05 and padj < 1")
dim(subset(resLFCashr_AP_vs_AC, pvalue < 0.05))[1]
dim(subset(resLFCashr_AP_vs_AC, padj < 0.1))[1]

print("number of differentially bound and expressed transcripts : resLFCashr : AP vs AV : pvalue < 0.05 and padj < 1")
dim(subset(resLFCashr_AP_vs_AV, pvalue < 0.05))[1]
dim(subset(resLFCashr_AP_vs_AV, padj < 0.1))[1]

print("number of differentially bound and expressed transcripts : resLFCashr : AC vs AV : pvalue < 0.05 and padj < 1")
dim(subset(resLFCashr_AC_vs_AV, pvalue < 0.05))[1]
dim(subset(resLFCashr_AC_vs_AV, padj < 0.1))[1]

###########################################################
###########################################################
using 'ashr' for LFC shrinkage. If used in published research, please cite:
    Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
    https://doi.org/10.1093/biostatistics/kxw041

using 'ashr' for LFC shrinkage. If used in published research, please cite:
    Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
    https://doi.org/10.1093/biostatistics/kxw041

using 'ashr' for LFC shrinkage. If used in published research, please cite:
    Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
    https://doi.org/10.1093/biostatistics/kxw041

[1] "number of differentially bound and expressed transcripts : resLFCashr : AP vs AC : pvalue < 0.05 and padj < 1"
754
136
[1] "number of differentially bound and expressed transcripts : resLFCashr : AP vs AV : pvalue < 0.05 and padj < 1"
4235
4031
[1] "number of differentially bound and expressed transcripts : resLFCashr : AC vs AV : pvalue < 0.05 and padj < 1"
4195
3952
In [ ]:

In [21]:
print("Comparing the number of DE genes for the comparison : AP vs AC for a pvalue < 0.05")

# Define thresholds
pval_cutoff <- 0.05
lfc_cutoff <- 0.5

# The information about DE peaks was stored in :
# res_AP_vs_AC 
# res_AP_vs_AV 
# res_AC_vs_AV 

# resLFCnormal_AP_vs_AV
# resLFCnormal_AC_vs_AV 
# resLFCnormal_AP_vs_AC 

# resLFCashr_AP_vs_AV 
# resLFCashr_AC_vs_AV 
# resLFCashr_AP_vs_AC 

# Count DEGs
n_DE_unshrunken <- sum(res_AP_vs_AC$pvalue < 0.05, na.rm = TRUE)
n_DE_shrink_normal <- sum(resLFCnormal_AP_vs_AC$pvalue < 0.05, na.rm = TRUE)  # Same p-values as unshrunken
n_DE_ashr <- sum( resLFCashr_AP_vs_AC$pvalue < 0.05, na.rm = TRUE)            # Same p-values as unshrunken

# Build a data frame
compare_df1 <- data.frame(
  Method = c("Unshrunken", "Shrink: normal", "Shrink: ashr"),
  DE_Genes = c(n_DE_unshrunken, n_DE_shrink_normal, n_DE_ashr)
)

# Plot it
p1 = ggplot(compare_df1, aes(x = Method, y = DE_Genes, fill = Method)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  labs(
    title = "Comparison of DE Gene Counts (AC vs AP)",
    y = "Number of DE Genes (p < 0.05 or s < 0.05)",
    x = "Shrinkage Method"
  ) +
  theme(legend.position = "none") +
  geom_text(aes(label = DE_Genes), vjust = -0.3, size = 4)

print("Comparing the number of DE genes for the comparison : AP vs AC for a pvalue < 0.05")

# Shrinkage Methods: The lfcShrink() function in DESeq2 is used to obtain more accurate estimates of log2 fold changes, 
# especially for genes with low counts or high variability.
# Threshold Selection: The choice of a log2FC threshold (e.g., 0.3) is somewhat arbitrary and should be based on the biological context 
# and the desired stringency of the analysis.
# Interpretation: Comparing the number of DE genes across different shrinkage methods can provide insights into the robustness of your findings. 
# It's common to observe variations in the number of DE genes identified, depending on the method used.

# Raw (non-shrunk)
n_raw <- sum(res_AP_vs_AC$pvalue < pval_cutoff & abs(res_AP_vs_AC$log2FoldChange) > lfc_cutoff, na.rm = TRUE)
# Normal shrink
n_normal <- sum(res_AP_vs_AC$pvalue < pval_cutoff & abs(resLFCnormal_AP_vs_AC$log2FoldChange) > lfc_cutoff, na.rm = TRUE)
# Ashr shrink (using s-value instead of p-value)
n_ashr <- sum(resLFCashr_AP_vs_AC$pvalue < pval_cutoff & abs(resLFCashr_AP_vs_AC$log2FoldChange) > lfc_cutoff, na.rm = TRUE)

# Combine into a data frame
compare_df2 <- data.frame(
  Method = c("Unshrunken", "Shrink: normal", "Shrink: ashr"),
  DE_Genes = c(n_raw, n_normal, n_ashr)
)

p2 = ggplot(compare_df2, aes(x = Method, y = DE_Genes, fill = Method)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  labs(
    title = "DE Gene Counts (AP vs AC)",
    subtitle = "p < 0.05 and |log2FC| > 0.5",
    y = "Number of DE Genes",
    x = "Method"
  ) +
  geom_text(aes(label = DE_Genes), vjust = -0.3, size = 4) +
  theme(legend.position = "none")


# Print the plot in Jupyter
options(repr.plot.width = 8, repr.plot.height = 6)
p1 + p2
[1] "Comparing the number of DE genes for the comparison : AP vs AC for a pvalue < 0.05"
[1] "Comparing the number of DE genes for the comparison : AP vs AC for a pvalue < 0.05"
No description has been provided for this image
In [ ]:

In [22]:
print("MA plots:")

# Define thresholds
pval_cutoff <- 0.05
lfc_cutoff <- 0.5
[1] "MA plots:"
In [23]:
make_MA_plot <- function(res_df, title = "MA Plot", lfc_cutoff = 0.3, pval_cutoff = 0.1, ylim = c(-2, 2)) {
  
  res_df <- as.data.frame(res_df)
  
  # Replace NA p-values with the threshold so they are not considered significant
  res_df$pvalue[is.na(res_df$pvalue)] <- 1
  
  # Label significance based on thresholds
  res_df$sig <- ifelse(res_df$pvalue < pval_cutoff & abs(res_df$log2FoldChange) > lfc_cutoff,
                       "Significant", "Not Significant")

  # Generate the MA plot
  ggplot(res_df, aes(x = baseMean, y = log2FoldChange, color = sig)) +
    geom_point(alpha = 0.6, size = 1) +
    scale_x_log10() +
    scale_color_manual(values = c("Significant" = "#D7263D", "Not Significant" = "gray70")) +
    geom_hline(yintercept = c(-lfc_cutoff, lfc_cutoff), linetype = "dashed", color = "black") +
    coord_cartesian(ylim = ylim) +
    theme_minimal(base_size = 14) +
    labs(
      title = title,
      x = "Mean Expression (log10 scale)",
      y = "log2 Fold Change",
      color = "Significance"
    ) +
    theme(
      legend.position = "right",
      panel.grid.minor = element_blank()
    )
}
In [24]:
print("MA plots:")
print("AP vs AC")

# Create the plots
p1 <- make_MA_plot(res_AP_vs_AC, title = "AP vs AC (no shrinkage)", lfc_cutoff = 0.3, pval_cutoff = 0.1)
p2 <- make_MA_plot(resLFCnormal_AP_vs_AC, title = "AP vs AC (normal shrinkage)", lfc_cutoff = 0.3, pval_cutoff = 0.1)
p3 <- make_MA_plot(resLFCashr_AP_vs_AC, title = "AP vs AC (ashr shrinkage)", lfc_cutoff = 0.3, pval_cutoff = 0.1)

# Combine side-by-side
options(repr.plot.width = 20, repr.plot.height = 6)

# Combine side-by-side
plot_grid(p1, p2, p3, labels = c("A", "B", "C"), ncol = 3, align = 'h')
[1] "MA plots:"
[1] "AP vs AC"
No description has been provided for this image
In [25]:
print("MA plots:")
print("AC vs AV")

# Create the plots
p1 <- make_MA_plot(res_AC_vs_AV, title = "AC vs AV (no shrinkage)", lfc_cutoff = 0.3, pval_cutoff = 0.1)
p2 <- make_MA_plot(resLFCnormal_AC_vs_AV, title = "AC vs AV (normal shrinkage)", lfc_cutoff = 0.3, pval_cutoff = 0.1)
p3 <- make_MA_plot(resLFCashr_AC_vs_AV, title = "AC vs AV (ashr shrinkage)", lfc_cutoff = 0.3, pval_cutoff = 0.1)

# Combine side-by-side
options(repr.plot.width = 18, repr.plot.height = 6)

# Combine side-by-side
plot_grid(p1, p2, p3, labels = c("A", "B", "C"), ncol = 3, align = 'h')
[1] "MA plots:"
[1] "AC vs AV"
No description has been provided for this image
In [26]:
print("MA plots:")
print("AP vs AV")

# Create the plots
p1 <- make_MA_plot(res_AP_vs_AV, title = "AP vs AV (no shrinkage)", lfc_cutoff = 0.3, pval_cutoff = 0.1)
p2 <- make_MA_plot(resLFCnormal_AP_vs_AV, title = "AP vs AV (normal shrinkage)", lfc_cutoff = 0.3, pval_cutoff = 0.1)
p3 <- make_MA_plot(resLFCashr_AP_vs_AV, title = "AP vs AV (ashr shrinkage)", lfc_cutoff = 0.3, pval_cutoff = 0.1)

# Combine side-by-side
options(repr.plot.width = 18, repr.plot.height = 6)

# Combine side-by-side
plot_grid(p1, p2, p3, labels = c("A", "B", "C"), ncol = 3, align = 'h')
[1] "MA plots:"
[1] "AP vs AV"
No description has been provided for this image
In [ ]:

In [27]:
print("plotting dispersion")

# Combine side-by-side
options(repr.plot.width = 6, repr.plot.height = 6)

plotDispEsts(dds)
dispersionFunction(dds)
[1] "plotting dispersion"
structure(function (q) 
coefs[1] + coefs[2]/q, coefficients = c(asymptDisp = 0.0241801588503249, 
extraPois = 2.70666357215647), fitType = "parametric", varLogDispEsts = 0.898737824332759, dispPriorVar = 0.25)
No description has been provided for this image
In [ ]:

In [28]:
print("RLD and VST transformations")

# Effects of transformations on the variance
rld <- rlog(dds, blind = FALSE)  
vsd <- vst(dds, blind = FALSE) 
ntd <- normTransform(dds)
# meanSdPlot(assay(ntd))
# meanSdPlot(assay(rld))
# meanSdPlot(assay(vsd))
[1] "RLD and VST transformations"
In [29]:
library("pheatmap")

# Select the top 20 differentially expressed genes based on adjusted p-value
top_genes <- rownames(res_AP_vs_AC)[order(res_AP_vs_AC$padj, na.last=NA)][1:20]  #

# Extract normalized transformed counts for the top genes
top_counts <- assay(vsd)[top_genes, ]

# Create annotation dataframe
df <- as.data.frame(colData(dds)["condition"])  # Ensure it is a proper dataframe
colnames(df) <- "Condition"  # Rename column for clarity

# Generate heatmap
options(repr.plot.width = 6, repr.plot.height = 6)
pheatmap(top_counts, 
         cluster_rows=TRUE,  # Cluster rows to group similar genes
         show_rownames=TRUE,  # Show gene names
         cluster_cols=TRUE,  # Cluster samples
         annotation_col=df,  # Add sample condition annotations
         scale="row",  # Normalize each gene's expression across samples
         fontsize_row=8)  # Adjust row text size for readability
No description has been provided for this image
In [30]:
print("PCA and MDS plots of rlog- and vst-transformed counts")
[1] "PCA and MDS plots of rlog- and vst-transformed counts"
In [31]:
# 1. rlog transformation and PCA
rld <- rlog(dds, blind = FALSE)
pca_rld <- plotPCA(rld, intgroup = "condition", returnData = TRUE)
percentVar_rld <- round(100 * attr(pca_rld, "percentVar"))

pca_rld_plot <- ggplot(pca_rld, aes(PC1, PC2, color = condition)) +
  geom_point(size = 4) +
  scale_color_brewer(palette = "Dark2") +
  xlab(paste0("PC1: ", percentVar_rld[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar_rld[2], "% variance")) +
  ggtitle("PCA (rlog)") +
  theme_minimal()

# 2. rlog MDS
dists_rld <- dist(t(assay(rld)))
mds_rld <- cmdscale(as.matrix(dists_rld))
mds_rld_df <- data.frame(MDS1 = mds_rld[,1], MDS2 = mds_rld[,2], condition = col_data$condition)

mds_rld_plot <- ggplot(mds_rld_df, aes(MDS1, MDS2, color = condition)) +
  geom_point(size = 4) +
  scale_color_brewer(palette = "Dark2") +
  ggtitle("MDS (rlog)") +
  theme_minimal()

# 3. vst transformation and PCA
vsd <- vst(dds, blind = FALSE)
pca_vsd <- plotPCA(vsd, intgroup = "condition", returnData = TRUE)
percentVar_vsd <- round(100 * attr(pca_vsd, "percentVar"))

pca_vsd_plot <- ggplot(pca_vsd, aes(PC1, PC2, color = condition)) +
  geom_point(size = 4) +
  scale_color_brewer(palette = "Dark2") +
  xlab(paste0("PC1: ", percentVar_vsd[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar_vsd[2], "% variance")) +
  ggtitle("PCA (vst)") +
  theme_minimal()

# 4. vst MDS
dists_vsd <- dist(t(assay(vsd)))
mds_vsd <- cmdscale(as.matrix(dists_vsd))
mds_vsd_df <- data.frame(MDS1 = mds_vsd[,1], MDS2 = mds_vsd[,2], condition = col_data$condition)

mds_vsd_plot <- ggplot(mds_vsd_df, aes(MDS1, MDS2, color = condition)) +
  geom_point(size = 4) +
  scale_color_brewer(palette = "Dark2") +
  ggtitle("MDS (vst)") +
  theme_minimal()

# Combine side-by-side
options(repr.plot.width = 12, repr.plot.height = 8)
# Combine all plots in a 2x2 grid
plot_grid(
  pca_rld_plot, mds_rld_plot,
  pca_vsd_plot, mds_vsd_plot,
  labels = c("A", "B", "C", "D"),
  ncol = 2, align = "hv"
)
using ntop=500 top features by variance

using ntop=500 top features by variance

No description has been provided for this image
In [ ]:

In [32]:
library(pheatmap)
library(RColorBrewer)
library(gridExtra)
library(grid)

# === RLOG Heatmap ===
rlog_matrix <- assay(rld)
sampleDists_rlog <- dist(t(rlog_matrix))
sampleDistMatrix_rlog <- as.matrix(sampleDists_rlog)
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)

p1 <- pheatmap(sampleDistMatrix_rlog,
               clustering_distance_rows = sampleDists_rlog,
               clustering_distance_cols = sampleDists_rlog,
               col = colors,
               fontsize_row = 10,
               fontsize_col = 10,
               cellwidth = 60,
               cellheight = 60,
               angle_col = 45,
               main = "Sample Distance Heatmap (rlog)",
               silent = TRUE)

# === VST Heatmap ===
vsd_matrix <- assay(vsd)
sampleDists_vsd <- dist(t(vsd_matrix))
sampleDistMatrix_vsd <- as.matrix(sampleDists_vsd)

p2 <- pheatmap(sampleDistMatrix_vsd,
               clustering_distance_rows = sampleDists_vsd,
               clustering_distance_cols = sampleDists_vsd,
               col = colors,
               fontsize_row = 10,
               fontsize_col = 10,
               cellwidth = 60,
               cellheight = 60,
               angle_col = 45,
               main = "Sample Distance Heatmap (vst)",
               silent = TRUE)

# === Combine with spacing and ensure layout fits ===
grid.newpage()  # Ensures fresh drawing surface

# Combine side-by-side
options(repr.plot.width = 20, repr.plot.height = 10)

# === Convert pheatmap outputs to grobs ===
grob1 <- p1[[4]]
grob2 <- p2[[4]]

# === Combine with cowplot ===
cowplot::plot_grid(grob1, grob2, ncol = 2, rel_widths = c(1, 1))
Attaching package: 'gridExtra'


The following object is masked from 'package:dplyr':

    combine


The following object is masked from 'package:Biobase':

    combine


The following object is masked from 'package:BiocGenerics':

    combine


No description has been provided for this image
In [33]:
print("Methods to use : GLM-PCA for PCA and PoissonDistance to calculate the sample distances")
# Another option for calculating sample distances is to use the Poisson Distance (Witten 2011), implemented in the PoiClaClu package.
# This measure of dissimilarity between counts also takes the inherent variance structure of counts into consideration when calculating
# the distances between samples. The PoissonDistance function takes the original count matrix (not normalized) with samples as rows
# instead of columns, so we need to transpose the counts in dds.
[1] "Methods to use : GLM-PCA for PCA and PoissonDistance to calculate the sample distances"
In [34]:
print("PCA by using GLMPCA library. RLOG and VSD transformations are more suitable than scale().")

# === GLM-PCA plot ===
gpca <- glmpca(assay(dds), L = 2)
gpca.dat <- gpca$factors
gpca.dat$sample <- colnames(dds)
gpca.dat$condition <- colData(dds)$condition

p_gpca <- ggplot(gpca.dat, aes(x = dim1, y = dim2, color = condition)) +
  geom_point(size = 4.5, alpha = 0.85, stroke = 1) +
  geom_text_repel(aes(label = sample), size = 4, box.padding = 0.4, max.overlaps = 8) +
  coord_fixed() +
  theme_minimal(base_size = 16) +
  labs(title = "GLM-PCA", x = "GLM-PC1", y = "GLM-PC2") +
  scale_color_brewer(palette = "Set2") +
  theme(
    legend.position = "right",
    plot.title = element_text(face = "bold", size = 16)
  )

# === Poisson distance heatmap ===
poisd <- PoissonDistance(t(counts(dds)))
samplePoisDistMatrix <- as.matrix(poisd$dd)

sample_names <- colnames(dds)
rownames(samplePoisDistMatrix) <- sample_names
colnames(samplePoisDistMatrix) <- sample_names

colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)

pheat <- pheatmap(samplePoisDistMatrix,
                  clustering_distance_rows = poisd$dd,
                  clustering_distance_cols = poisd$dd,
                  col = colors,
                  fontsize_row = 10,
                  fontsize_col = 10,
                  cellwidth = 50,
                  cellheight = 50,
                  angle_col = 45,
                  main = "Poisson Distance Heatmap",
                  silent = TRUE)

# Convert to grob for use with cowplot
g_poisson <- ggdraw(grobTree(pheat$gtable)) + theme(plot.margin = margin(5, 5, 5, 5))

# Set display size
options(repr.plot.width = 16, repr.plot.height = 8)

# Combine plots with cleaner spacing
combined_plot <- plot_grid(
  g_poisson, p_gpca,
  labels = c("A", "B"),
  label_size = 16,
  nrow = 1,
  rel_widths = c(1.2, 1)
)

# Print
print(combined_plot)
[1] "PCA by using GLMPCA library. RLOG and VSD transformations are more suitable than scale()."
No description has been provided for this image
In [ ]:

In [35]:
print("Performing Surrogate Variable Analysis")
print("SVA analysis")
# SV1, SV2, ... are surrogate variables — latent (hidden) factors estimated from the data that capture unwanted variation 
# (like batch effects, technical noise, or hidden biological subtypes).
# You can think of them as "virtual covariates" — constructed purely from the structure of your data — 
# that explain sources of variation not included in your model (like treatment or condition).
[1] "Performing Surrogate Variable Analysis"
[1] "SVA analysis"
In [36]:
# === Prepare data ===
dat  <- counts(dds, normalized = TRUE)
idx  <- rowMeans(dat) > 1
dat  <- dat[idx, ]

mod  <- model.matrix(~ condition, colData(dds))
mod0 <- model.matrix(~ 1, colData(dds))

svseq <- svaseq(dat, mod, mod0, n.sv = 2)
head(svseq$sv, 2)

# Set layout: 1 row, 2 columns
par(
  mfrow = c(1, 2),
  mar = c(5, 5, 4, 2) + 0.1,  # bottom, left, top, right margins
  cex.main = 1.4,             # title size
  cex.axis = 1.1,             # axis label size
  cex.lab = 1.2,              # axis title size
  las = 1                     # y-axis labels horizontal
)

# Loop through SV1 and SV2
for (i in 1:2) {
  stripchart(
    svseq$sv[, i] ~ dds$condition,
    vertical = TRUE,
    method = "jitter",
    pch = 21,
    bg = "steelblue",
    col = "black",
    frame.plot = FALSE,
    ylim = c(-0.8, 0.8),  # fixed y-axis range
    main = paste0("Surrogate Variable SV", i),
    ylab = "Surrogate Variable Value",
    xlab = "Condition",
    cex = 1.3
  )
  abline(h = 0, lty = 2, col = "gray50", lwd = 1.5)
}
Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  
A matrix: 2 × 2 of type dbl
-0.3879922 0.0142916
-0.2524473-0.1711288
No description has been provided for this image
In [37]:
# Finally, in order to use SVA to remove any effect on the counts from our surrogate variables, we simply add these two surrogate variables 
# as columns to the DESeqDataSet and then add them to the design:

ddssva <- dds
ddssva$SV1 <- svseq$sv[,1]
ddssva$SV2 <- svseq$sv[,2]
design(ddssva) <- ~ SV1 + SV2 + condition
  
ddssva$SV1
ddssva$SV2

# length(ddssva$SV1)
# length(ddssva$SV2)

ddssva <- DESeq(ddssva)
resultsNames(ddssva)

# rowRanges(ddssva)
# colData(ddssva)
# assays(ddssva)
# assay(ddssva)
# length(rowRanges(ddssva))

res_ddssva <- results(ddssva)
resultsNames(res_ddssva)

# Get results for different comparisons
res_ddssva_AP_vs_AC <- results(ddssva, contrast = c("condition", "AP", "AC"))
res_ddssva_AP_vs_AV <- results(ddssva, contrast = c("condition", "AP", "AV"))
res_ddssva_AC_vs_AV <- results(ddssva, contrast = c("condition", "AC", "AV"))

# summary(res_ddssva_AP_vs_AV)
# summary(res_ddssva_AC_vs_AV)
# summary(res_ddssva_AP_vs_AC)

# Save results
write.csv(as.data.frame(res_ddssva_AP_vs_AC), file = "A549.peaks.IP.samples.DESeq2_AP_vs_AC_results.sva.csv")
write.csv(as.data.frame(res_ddssva_AP_vs_AV), file = "A549.peaks.IP.samples.DESeq2_AP_vs_AV_results.sva.csv")
write.csv(as.data.frame(res_ddssva_AC_vs_AV), file = "A549.peaks.IP.samples.DESeq2_AC_vs_AV_results.sva.csv")

###########################################################
###########################################################

print("number of differentially bound and expressed transcripts : AP vs AC : pvalue < 0.05, and padj < 0.1")
dim(subset(res_ddssva_AP_vs_AC, pvalue < 0.05))
dim(subset(res_ddssva_AP_vs_AC, padj < 0.1))

print("number of differentially bound and expressed transcripts : AP vs AV : pvalue < 0.05, and padj < 0.1")
dim(subset(res_ddssva_AP_vs_AV, pvalue < 0.05))
dim(subset(res_ddssva_AP_vs_AV, padj < 0.1))

print("number of differentially bound and expressed transcripts : AC vs AV : pvalue < 0.05, and padj < 0.1")
dim(subset(res_ddssva_AC_vs_AV, pvalue < 0.05))
dim(subset(res_ddssva_AC_vs_AV, padj < 0.1))
  1. -0.387992165531141
  2. -0.252447306761482
  3. -0.0540895790399508
  4. -0.390608883926965
  5. 0.686564253598224
  6. 0.398573681661316
  1. 0.014291602523688
  2. -0.171128761832017
  3. -0.284197009646111
  4. 0.115234912859809
  5. -0.478735429404207
  6. 0.804534685498839
using pre-existing size factors

estimating dispersions

found already estimated dispersions, replacing these

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

  1. 'Intercept'
  2. 'SV1'
  3. 'SV2'
  4. 'condition_AC_vs_AV'
  5. 'condition_AP_vs_AV'
[1] "number of differentially bound and expressed transcripts : AP vs AC : pvalue < 0.05, and padj < 0.1"
  1. 820
  2. 6
  1. 266
  2. 6
[1] "number of differentially bound and expressed transcripts : AP vs AV : pvalue < 0.05, and padj < 0.1"
  1. 262
  2. 6
  1. 41
  2. 6
[1] "number of differentially bound and expressed transcripts : AC vs AV : pvalue < 0.05, and padj < 0.1"
  1. 263
  2. 6
  1. 37
  2. 6
In [38]:
# Transform count data
vsd2 <- vst(ddssva, blind = TRUE)
rld2 <- rlog(ddssva, blind = TRUE)

# Get PCA data
pca_vsd <- plotPCA(vsd2, intgroup = "condition", returnData = TRUE)
pca_rld <- plotPCA(rld2, intgroup = "condition", returnData = TRUE)

# Variance explained
percentVar_vsd <- round(100 * attr(pca_vsd, "percentVar"))
percentVar_rld <- round(100 * attr(pca_rld, "percentVar"))

# PCA plot for VST
p1 <- ggplot(pca_vsd, aes(PC1, PC2, color = condition)) +
  geom_point(size = 3, alpha = 0.8) +
  labs(
    title = "PCA after SVA (VST)",
    x = paste0("PC1 (", percentVar_vsd[1], "%)"),
    y = paste0("PC2 (", percentVar_vsd[2], "%)")
  ) +
  theme_minimal(base_size = 14) +
  scale_color_brewer(palette = "Set2") +
  theme(legend.position = "right")

# PCA plot for RLOG
p2 <- ggplot(pca_rld, aes(PC1, PC2, color = condition)) +
  geom_point(size = 3, alpha = 0.8) +
  labs(
    title = "PCA after SVA (RLOG)",
    x = paste0("PC1 (", percentVar_rld[1], "%)"),
    y = paste0("PC2 (", percentVar_rld[2], "%)")
  ) +
  theme_minimal(base_size = 14) +
  scale_color_brewer(palette = "Set2") +
  theme(legend.position = "right")

# Show both plots side by side with legends
options(repr.plot.width = 14, repr.plot.height = 6)
plot_grid(p1, p2, labels = c("A", "B"), ncol = 2)
using ntop=500 top features by variance

using ntop=500 top features by variance

No description has been provided for this image
In [39]:
print("RUVseq analysis")
[1] "RUVseq analysis"
In [40]:
library(RUVSeq)
library(DESeq2)

# Create SeqExpressionSet from DESeq2 object
set <- newSeqExpressionSet(counts(dds))

# Keep genes with sufficient expression
idx  <- rowSums(counts(set) > 5) >= 2
set  <- set[idx, ]

# Normalize
set <- betweenLaneNormalization(set, which = "upper")

# Run DESeq2 just to get raw p-values for empirical control genes
dds_temp <- dds[idx, ]
dds_temp <- DESeq(dds_temp)
res_temp <- results(dds_temp)

# Define empirical control genes as those with high p-value (non-DE)
not.sig <- rownames(res_temp)[which(res_temp$pvalue > 0.1)]
empirical <- rownames(set)[rownames(set) %in% not.sig]

# Apply RUVg with k=2 unwanted factors
set <- RUVg(set, empirical, k = 2)

# Add unwanted factors to DESeq2 design
ddsruv <- dds[idx, ]  # use filtered genes
ddsruv$W1 <- set$W_1
ddsruv$W2 <- set$W_2
design(ddsruv) <- ~ W1 + W2 + condition

# Run DESeq2 with adjusted design
ddsruv <- DESeq(ddsruv)

# Check model variables
resultsNames(ddsruv)

# Get results for different comparisons
res_ddsruv_AP_vs_AC <- results(ddsruv, contrast = c("condition", "AP", "AC"))
res_ddsruv_AP_vs_AV <- results(ddsruv, contrast = c("condition", "AP", "AV"))
res_ddsruv_AC_vs_AV <- results(ddsruv, contrast = c("condition", "AC", "AV"))

# Save results
write.csv(as.data.frame(res_ddsruv_AP_vs_AC), file = "A549.peaks.IP.samples.DESeq2_AP_vs_AC_results.ruv.csv")
write.csv(as.data.frame(res_ddsruv_AP_vs_AV), file = "A549.peaks.IP.samples.DESeq2_AP_vs_AV_results.ruv.csv")
write.csv(as.data.frame(res_ddsruv_AC_vs_AV), file = "A549.peaks.IP.samples.DESeq2_AC_vs_AV_results.ruv.csv")

# Summary statistics
print("number of differentially bound and expressed transcripts : AP vs AC : pvalue < 0.05, and padj < 0.1")
print(dim(subset(res_ddsruv_AP_vs_AC, pvalue < 0.05)))
print(dim(subset(res_ddsruv_AP_vs_AC, padj < 0.1)))

print("number of differentially bound and expressed transcripts : AP vs AV : pvalue < 0.05, and padj < 0.1")
print(dim(subset(res_ddsruv_AP_vs_AV, pvalue < 0.05)))
print(dim(subset(res_ddsruv_AP_vs_AV, padj < 0.1)))

print("number of differentially bound and expressed transcripts : AC vs AV : pvalue < 0.05, and padj < 0.1")
print(dim(subset(res_ddsruv_AC_vs_AV, pvalue < 0.05)))
print(dim(subset(res_ddsruv_AC_vs_AV, padj < 0.1)))
using pre-existing size factors

estimating dispersions

found already estimated dispersions, replacing these

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

using pre-existing size factors

estimating dispersions

found already estimated dispersions, replacing these

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

  1. 'Intercept'
  2. 'W1'
  3. 'W2'
  4. 'condition_AC_vs_AV'
  5. 'condition_AP_vs_AV'
[1] "number of differentially bound and expressed transcripts : AP vs AC : pvalue < 0.05, and padj < 0.1"
[1] 488   6
[1] 76  6
[1] "number of differentially bound and expressed transcripts : AP vs AV : pvalue < 0.05, and padj < 0.1"
[1] 433   6
[1] 83  6
[1] "number of differentially bound and expressed transcripts : AC vs AV : pvalue < 0.05, and padj < 0.1"
[1] 253   6
[1] 33  6
In [41]:
# Set layout to 1 row, 2 columns (side by side)
par(
  mfrow = c(1, 2),
  mar = c(4, 4, 3, 1),   # margins: bottom, left, top, right
  cex.main = 1.2,        # title size
  cex.axis = 1.0,        # axis tick label size
  cex.lab = 1.1,         # axis title size
  las = 1                # horizontal y-axis labels
)

# Loop over W1 and W2
for (i in 1:2) {
  stripchart(
    pData(set)[, i] ~ dds$condition,
    vertical = TRUE,
    method = "jitter",
    pch = 21,
    bg = "steelblue",
    col = "black",
    frame.plot = FALSE,
    main = paste("W", i),
    ylab = "Factor Value",
    xlab = "Condition",
    cex = 1.1
  )
  abline(h = 0, lty = 2, col = "gray60", lwd = 1)
}
No description has been provided for this image
In [42]:
# Transform count data from ddsruv
vsd3 <- vst(ddsruv, blind = TRUE)
rld3 <- rlog(ddsruv, blind = TRUE)

# Get PCA data
pca_vsd <- plotPCA(vsd3, intgroup = "condition", returnData = TRUE)
pca_rld <- plotPCA(rld3, intgroup = "condition", returnData = TRUE)

# Variance explained
percentVar_vsd <- round(100 * attr(pca_vsd, "percentVar"))
percentVar_rld <- round(100 * attr(pca_rld, "percentVar"))

# PCA plot for VST
p1 <- ggplot(pca_vsd, aes(PC1, PC2, color = condition)) +
  geom_point(size = 3, alpha = 0.8) +
  labs(
    title = "PCA after RUV (VST)",
    x = paste0("PC1 (", percentVar_vsd[1], "%)"),
    y = paste0("PC2 (", percentVar_vsd[2], "%)")
  ) +
  theme_minimal(base_size = 14) +
  scale_color_brewer(palette = "Set2") +
  theme(legend.position = "right")

# PCA plot for RLOG
p2 <- ggplot(pca_rld, aes(PC1, PC2, color = condition)) +
  geom_point(size = 3, alpha = 0.8) +
  labs(
    title = "PCA after RUV (RLOG)",
    x = paste0("PC1 (", percentVar_rld[1], "%)"),
    y = paste0("PC2 (", percentVar_rld[2], "%)")
  ) +
  theme_minimal(base_size = 14) +
  scale_color_brewer(palette = "Set2") +
  theme(legend.position = "right")

# Show both plots side by side with legends
options(repr.plot.width = 14, repr.plot.height = 6)
plot_grid(p1, p2, labels = c("A", "B"), ncol = 2)
using ntop=500 top features by variance

using ntop=500 top features by variance

No description has been provided for this image
In [ ]:

In [43]:
library(EnhancedVolcano)

# Color	Label in legend	Meaning
# Grey	NS	Not Significant – the gene did not pass the p-value or log₂FC thresholds
# Green	Log₂ FC	The gene passed the log₂ fold change cutoff but not the p-value cutoff
# Blue	p-value	The gene passed the p-value cutoff but not the log₂FC cutoff
# Red	p value and log₂ FC	The gene passed both p-value and log₂FC thresholds — most interesting hits

# Define thresholds
pval_cutoff <- 0.05
lfc_cutoff <- 0.5

# Set up the plotting window size for a more compact layout
options(repr.plot.width = 10, repr.plot.height = 8)

EnhancedVolcano(res_AP_vs_AC,
                lab = rownames(res_AP_vs_AC),
                x = 'log2FoldChange',
                y = 'pvalue',
                pCutoff = pval_cutoff,
                FCcutoff = lfc_cutoff,
                title = 'AP vs AC',
                pointSize = 2.0,
                labSize = 3.0,
                legendPosition = 'right',
                legendLabSize = 10,
                legendIconSize = 3.0,
                drawConnectors = TRUE,
                widthConnectors = 0.5,
                boxedLabels = FALSE)
Warning message:
"ggrepel: 630 unlabeled data points (too many overlaps). Consider increasing max.overlaps"
No description has been provided for this image
In [ ]:

In [44]:
library(clusterProfiler)
library(org.Hs.eg.db) 
library(GO.db)         
library(DO.db)         
library(KEGGREST)      
library(ReactomePA)    
library(enrichplot)    
library(dplyr)
library(msigdbr)
library(msigdb)
library(msigdf)
library(msigdbdf)
clusterProfiler v4.14.6 Learn more at https://yulab-smu.top/contribution-knowledge-mining/

Please cite:

T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan,
X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal
enrichment tool for interpreting omics data. The Innovation. 2021,
2(3):100141


Attaching package: 'clusterProfiler'


The following object is masked from 'package:AnnotationDbi':

    select


The following object is masked from 'package:XVector':

    slice


The following object is masked from 'package:purrr':

    simplify


The following object is masked from 'package:IRanges':

    slice


The following object is masked from 'package:S4Vectors':

    rename


The following object is masked from 'package:stats':

    filter


ReactomePA v1.50.0 Learn more at https://yulab-smu.top/contribution-knowledge-mining/

Please cite:

Guangchuang Yu, Qing-Yu He. ReactomePA: an R/Bioconductor package for
reactome pathway analysis and visualization. Molecular BioSystems.
2016, 12(2):477-479

enrichplot v1.26.6 Learn more at https://yulab-smu.top/contribution-knowledge-mining/

Please cite:

Guangchuang Yu, Li-Gen Wang, Guang-Rong Yan, Qing-Yu He. DOSE: an
R/Bioconductor package for Disease Ontology Semantic and Enrichment
analysis. Bioinformatics. 2015, 31(4):608-609

In [45]:
# Define thresholds
pval_cutoff <- 0.05
lfc_cutoff <- 0.5
fin_name = "A549.peaks.IP.samples."
In [46]:
res <- res_AP_vs_AC  
head(res,3)
log2 fold change (MLE): condition AP vs AC 
Wald test p-value: condition AP vs AC 
DataFrame with 3 rows and 6 columns
                 baseMean log2FoldChange     lfcSE      stat    pvalue
                <numeric>      <numeric> <numeric> <numeric> <numeric>
ENSG00000310528   11.7191      0.0419129  0.728597 0.0575255  0.954127
ENSG00000310527   54.1219      0.5556707  0.385164 1.4426877  0.149108
ENSG00000225880  658.6745      0.0392962  0.237716 0.1653074  0.868702
                     padj
                <numeric>
ENSG00000310528        NA
ENSG00000310527  0.838676
ENSG00000225880  0.997640
In [47]:
# Filter significant genes
res_sig <- as.data.frame(res) %>%
  rownames_to_column("gene") %>%
  filter(padj < pval_cutoff & abs(log2FoldChange) > lfc_cutoff)

# Map ENSEMBL IDs to Entrez IDs
gene_ids <- bitr(res_sig$gene, fromType = "ENSEMBL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)
res_merge <- merge(res_sig, gene_ids, by.x = "gene", by.y = "ENSEMBL")

# Prepare named gene list for GSEA
gene_list2 <- setNames(res_merge$log2FoldChange, res_merge$ENTREZID)
gene_list2 <- sort(gene_list2, decreasing = TRUE)

head(gene_list2, 2)
length(gene_list2)
'select()' returned 1:1 mapping between keys and columns

Warning message in bitr(res_sig$gene, fromType = "ENSEMBL", toType = "ENTREZID", :
"4.4% of input gene IDs are fail to map..."
80224
2.15110149990199
5793
2.0994743686027
87
In [48]:
library(clusterProfiler)
library(org.Hs.eg.db)
library(cowplot)

# Predefine result objects to avoid "not found" error
result <- NULL
result2 <- NULL

# GO Over-Representation Analysis (ORA)
result <- tryCatch({
  ego <- enrichGO(gene          = gene_ids$ENTREZID,
                  OrgDb         = org.Hs.eg.db,
                  ont           = "BP",
                  keyType       = "ENTREZID",  
                  pAdjustMethod = "BH",
                  pvalueCutoff  = 0.05,
                  qvalueCutoff  = 1,
                  readable      = TRUE)
  
  # Save results
  write.table(ego@result, file = paste0(fin_name, "_GO_OverRepresentation_Results.txt"), 
              row.names = FALSE, col.names = TRUE, quote = FALSE)

  # Save PNG
  png(paste0(fin_name, "_GO_OverRepresentation.png"), width = 1000, height = 800)
  print(dotplot(ego, showCategory = 20, title = "GO ORA (BP)"))
  dev.off()

  # Return ggplot object
  dotplot(ego, showCategory = 20, title = "GO ORA (BP)")

}, error = function(e) {
  cat("Error in GO ORA:", conditionMessage(e), "\n")
  NULL
})


# GO Enrichment Analysis (GSEA)
result2 <- tryCatch({
  ego2 <- gseGO(gene          = gene_list2,
                OrgDb         = org.Hs.eg.db,
                keyType       = "ENTREZID",  
                ont           = "BP",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.05)

  if (nrow(ego2@result) > 0) {
    write.table(ego2@result, file = paste0(fin_name, "_GO_Enrichment_Results.txt"), 
                row.names = FALSE, col.names = TRUE, quote = FALSE)

    png(paste0(fin_name, "_GO_Enrichment_Plot.png"), width = 1000, height = 800)
    print(dotplot(ego2, showCategory = 20, title = "GO GSEA (BP)"))
    dev.off()

    dotplot(ego2, showCategory = 20, title = "GO GSEA (BP)")
  } else {
    cat("No enriched terms in GSEA under pvalueCutoff.\n")
    NULL
  }

}, error = function(e) {
  cat("Error in GO GSEA:", conditionMessage(e), "\n")
  NULL
})

# === Display plots side by side if both exist ===
options(repr.plot.width = 6, repr.plot.height = 6)

if (!is.null(result) && !is.null(result2)) {
  plot_grid(result, result2, labels = c("A", "B"), ncol = 2, rel_widths = c(1, 1))
} else if (!is.null(result)) {
  options(repr.plot.width = 7, repr.plot.height = 6)
  print(result)
} else if (!is.null(result2)) {
  options(repr.plot.width = 7, repr.plot.height = 6)
  print(result2)
} else {
  cat("No enrichment plots to display.\n")
}
using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).


preparing geneSet collections...

GSEA analysis...

no term enriched under specific pvalueCutoff...

No enriched terms in GSEA under pvalueCutoff.
No description has been provided for this image
In [ ]:

In [49]:
# === KEGG Over-Representation Analysis (ORA) ===
result <- tryCatch({

  kegg_enrich <- enrichKEGG(
    gene           = gene_ids$ENTREZID,
    organism       = "hsa",
    pAdjustMethod  = "BH",
    pvalueCutoff   = 0.05
  )

  if (!is.null(kegg_enrich) && nrow(kegg_enrich@result) > 0) {

    # Save results
    write.table(kegg_enrich@result,
                file = paste0(fin_name, "_KEGG_OverRepresentation_Results.txt"),
                sep = "\t", row.names = FALSE, quote = FALSE)

    # Save PNG plot
    png(paste0(fin_name, "_KEGG_OverRepresentation_Plot.png"), width = 1000, height = 800)
    print(dotplot(kegg_enrich, showCategory = 20, title = "KEGG ORA"))
    dev.off()

    # Return plot object
    return(dotplot(kegg_enrich, showCategory = 20, title = "KEGG ORA"))

  } else {
    cat("⚠️ No enriched KEGG terms found in ORA.\n")
    return(NULL)
  }

}, error = function(e) {
  cat("❌ Error in KEGG ORA:", conditionMessage(e), "\n")
  return(NULL)
})



# === KEGG Gene Set Enrichment Analysis (GSEA) ===
result2 <- tryCatch({

  kegg_gse <- gseKEGG(
    geneList      = gene_list2,
    organism      = "hsa",
    minGSSize     = 120,
    pvalueCutoff  = 0.05,
    verbose       = FALSE
  )

  if (!is.null(kegg_gse) && nrow(kegg_gse@result) > 0) {

    # Save results
    write.table(kegg_gse@result,
                file = paste0(fin_name, "_KEGG_Enrichment_Results.txt"),
                sep = "\t", row.names = FALSE, quote = FALSE)

    # Save PNG plot
    png(paste0(fin_name, "_KEGG_Enrichment_Plot.png"), width = 1000, height = 800)
    print(dotplot(kegg_gse, showCategory = 20, title = "KEGG GSEA"))
    dev.off()

    # Return plot object
    return(dotplot(kegg_gse, showCategory = 20, title = "KEGG GSEA"))

  } else {
    cat("⚠️ No enriched KEGG terms found in GSEA.\n")
    return(NULL)
  }

}, error = function(e) {
  cat("❌ Error in KEGG GSEA:", conditionMessage(e), "\n")
  return(NULL)
})



# === Display Plots Nicely ===
# Set default figure size
options(repr.plot.width = 6, repr.plot.height = 6)

if (!is.null(result) && !is.null(result2)) {
  # Side-by-side
  options(repr.plot.width = 12, repr.plot.height = 6)
  plot_grid(result, result2, labels = c("A", "B"), ncol = 2, rel_widths = c(1, 1))

} else if (!is.null(result)) {
  options(repr.plot.width = 7, repr.plot.height = 6)
  print(result)

} else if (!is.null(result2)) {
  options(repr.plot.width = 7, repr.plot.height = 6)
  print(result2)

} else {
  cat("⚠️ No KEGG enrichment plots to display.\n")
}
Reading KEGG annotation online: "https://rest.kegg.jp/link/hsa/pathway"...

Reading KEGG annotation online: "https://rest.kegg.jp/list/pathway/hsa"...

Warning message in rep(yes, length.out = len):
"'x' is NULL so the result will be NULL"
❌ Error in KEGG ORA: replacement has length zero 
no term enriched under specific pvalueCutoff...

⚠️ No enriched KEGG terms found in GSEA.
NULL
⚠️ No KEGG enrichment plots to display.
In [50]:
library(clusterProfiler)
library(enrichplot)
library(cowplot)
library(pathview)         # Optional for WikiPathways
library(dplyr)
library(DOSE)
library(rWikiPathways)

# === WikiPathways Over-Representation Analysis (ORA) ===
result <- tryCatch({

  wikipathways_enrich <- enrichWP(
    gene           = gene_ids$ENTREZID,
    organism       = "Homo sapiens",
    pvalueCutoff   = 0.05
  )

  if (!is.null(wikipathways_enrich) && nrow(wikipathways_enrich@result) > 0) {

    # Save results
    write.table(wikipathways_enrich@result,
                file = paste0(fin_name, "_WikiPathways_ORA_Results.txt"),
                sep = "\t", row.names = FALSE, quote = FALSE)

    # Save plot
    png(paste0(fin_name, "_WikiPathways_ORA_Plot.png"), width = 1000, height = 800)
    print(dotplot(wikipathways_enrich, showCategory = 20, title = "WikiPathways ORA"))
    dev.off()

    # Return plot object
    return(dotplot(wikipathways_enrich, showCategory = 20, title = "WikiPathways ORA"))

  } else {
    cat("⚠️ No enriched WikiPathways terms found in ORA.\n")
    return(NULL)
  }

}, error = function(e) {
  cat("❌ Error in WikiPathways ORA:", conditionMessage(e), "\n")
  return(NULL)
})



# === WikiPathways Gene Set Enrichment Analysis (GSEA) ===
result2 <- tryCatch({

  wikipathways_gse <- gseWP(
    gene          = gene_list2,
    organism      = "Homo sapiens",
    pvalueCutoff  = 0.05
  )

  if (!is.null(wikipathways_gse) && nrow(wikipathways_gse@result) > 0) {

    # Save results
    write.table(wikipathways_gse@result,
                file = paste0(fin_name, "_WikiPathways_Enrichment_Results.txt"),
                sep = "\t", row.names = FALSE, quote = FALSE)

    # Save plot
    png(paste0(fin_name, "_WikiPathways_Enrichment_Plot.png"), width = 1000, height = 800)
    print(dotplot(wikipathways_gse, showCategory = 20, title = "WikiPathways GSEA"))
    dev.off()

    # Return plot object
    return(dotplot(wikipathways_gse, showCategory = 20, title = "WikiPathways GSEA"))

  } else {
    cat("⚠️ No enriched WikiPathways terms found in GSEA.\n")
    return(NULL)
  }

}, error = function(e) {
  cat("❌ Error in WikiPathways GSEA:", conditionMessage(e), "\n")
  return(NULL)
})



# === Display plots ===
options(repr.plot.width = 5, repr.plot.height = 5)  # Default size

if (!is.null(result) && !is.null(result2)) {
  # Show both plots side-by-side
  options(repr.plot.width = 12, repr.plot.height = 6)
  plot_grid(result, result2, labels = c("A", "B"), ncol = 2, rel_widths = c(1, 1))

} else if (!is.null(result)) {
  options(repr.plot.width = 7, repr.plot.height = 6)
  print(result)

} else if (!is.null(result2)) {
  options(repr.plot.width = 7, repr.plot.height = 6)
  print(result2)

} else {
  cat("⚠️ No WikiPathways enrichment plots to display.\n")
}
##############################################################################
Pathview is an open source software package distributed under GNU General
Public License version 3 (GPLv3). Details of GPLv3 is available at
http://www.gnu.org/licenses/gpl-3.0.html. Particullary, users are required to
formally cite the original Pathview paper (not just mention it) in publications
or products. For details, do citation("pathview") within R.

The pathview downloads and uses KEGG data. Non-academic uses may require a KEGG
license agreement (details at http://www.kegg.jp/kegg/legal.html).
##############################################################################

DOSE v4.0.0 Learn more at https://yulab-smu.top/contribution-knowledge-mining/

Please cite:

Guangchuang Yu, Li-Gen Wang, Guang-Rong Yan, Qing-Yu He. DOSE: an
R/Bioconductor package for Disease Ontology Semantic and Enrichment
analysis. Bioinformatics. 2015, 31(4):608-609


Attaching package: 'rWikiPathways'


The following object is masked from 'package:edgeR':

    getCounts


The following object is masked from 'package:GenomeInfoDb':

    listOrganisms


using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).


preparing geneSet collections...

GSEA analysis...

no term enriched under specific pvalueCutoff...

⚠️ No enriched WikiPathways terms found in GSEA.
NULL
⚠️ No WikiPathways enrichment plots to display.
No description has been provided for this image
In [ ]:

In [51]:
library(ReactomePA)
library(clusterProfiler)
library(enrichplot)
library(cowplot)

# === Reactome Over-Representation Analysis (ORA) ===
result <- tryCatch({

  reactome_ora <- enrichPathway(
    gene           = gene_ids$ENTREZID,
    organism       = "human",
    pAdjustMethod  = "BH",
    pvalueCutoff   = 0.05
  )

  if (!is.null(reactome_ora) && nrow(reactome_ora@result) > 0) {

    # Save results
    write.table(reactome_ora@result,
                file = paste0(fin_name, "_Reactome_ORA_Results.txt"),
                sep = "\t", row.names = FALSE, quote = FALSE)

    # Save plot
    png(paste0(fin_name, "_Reactome_ORA_Plot.png"), width = 1000, height = 800)
    print(dotplot(reactome_ora, showCategory = 20, title = "Reactome ORA"))
    dev.off()

    # Return plot for screen
    return(dotplot(reactome_ora, showCategory = 20, title = "Reactome ORA"))

  } else {
    cat("⚠️ No enriched Reactome terms found in ORA.\n")
    return(NULL)
  }

}, error = function(e) {
  cat("❌ Error in Reactome ORA:", conditionMessage(e), "\n")
  return(NULL)
})


# === Reactome GSEA Analysis ===
result2 <- tryCatch({

  reactome_gsea <- gsePathway(
    gene            = gene_list2,
    organism        = "human",
    pAdjustMethod   = "BH",
    pvalueCutoff    = 0.05
  )

  if (!is.null(reactome_gsea) && nrow(reactome_gsea@result) > 0) {

    # Save results
    write.table(reactome_gsea@result,
                file = paste0(fin_name, "_Reactome_Enrichment_Results.txt"),
                sep = "\t", row.names = FALSE, quote = FALSE)

    # Save plot
    png(paste0(fin_name, "_Reactome_Enrichment_Plot.png"), width = 1000, height = 800)
    print(dotplot(reactome_gsea, showCategory = 20, title = "Reactome GSEA"))
    dev.off()

    # Return plot for screen
    return(dotplot(reactome_gsea, showCategory = 20, title = "Reactome GSEA"))

  } else {
    cat("⚠️ No enriched Reactome terms found in GSEA.\n")
    return(NULL)
  }

}, error = function(e) {
  cat("❌ Error in Reactome GSEA:", conditionMessage(e), "\n")
  return(NULL)
})


# === Display plots ===
options(repr.plot.width = 6, repr.plot.height = 6)

if (!is.null(result) && !is.null(result2)) {
  # Show both side-by-side
  options(repr.plot.width = 12, repr.plot.height = 6)
  plot_grid(result, result2, labels = c("A", "B"), ncol = 2, rel_widths = c(1, 1))

} else if (!is.null(result)) {
  options(repr.plot.width = 7, repr.plot.height = 6)
  print(result)

} else if (!is.null(result2)) {
  options(repr.plot.width = 7, repr.plot.height = 6)
  print(result2)

} else {
  cat("⚠️ No Reactome enrichment plots to display.\n")
}
using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).


preparing geneSet collections...

GSEA analysis...

no term enriched under specific pvalueCutoff...

⚠️ No enriched Reactome terms found in GSEA.
NULL
⚠️ No Reactome enrichment plots to display.
No description has been provided for this image
In [52]:
library(msigdbr)
library(clusterProfiler)
library(enrichplot)
library(cowplot)
library(dplyr)

# === Prepare MSigDB C2 gene sets ===
msig_genesets <- msigdbr(species = "Homo sapiens", category = "C2")
C2_t2g <- msig_genesets %>% dplyr::select(gs_name, entrez_gene)

# Use gene_ids$ENTREZID for ORA, and named gene_list2 for GSEA
gene_list <- gene_ids$ENTREZID  # for ORA

# === MSigDB Over-Representation Analysis (ORA) ===
result <- tryCatch({

  msig_enrich <- enricher(
    gene       = gene_list,
    TERM2GENE  = C2_t2g
  )

  if (!is.null(msig_enrich) && nrow(msig_enrich@result) > 0) {

    write.table(msig_enrich@result,
                file = paste0(fin_name, "_MSigDB_OverRepresentation_Results.txt"),
                sep = "\t", row.names = FALSE, quote = FALSE)

    png(paste0(fin_name, "_MSigDB_ORA_Plot.png"), width = 1000, height = 800)
    print(dotplot(msig_enrich, showCategory = 20, title = "MSigDB ORA"))
    dev.off()

    return(dotplot(msig_enrich, showCategory = 20, title = "MSigDB ORA"))

  } else {
    cat("⚠️ No significant MSigDB pathways found in ORA.\n")
    return(NULL)
  }

}, error = function(e) {
  cat("❌ Error in MSigDB ORA:", conditionMessage(e), "\n")
  return(NULL)
})


# === MSigDB Gene Set Enrichment Analysis (GSEA) ===
result2 <- tryCatch({

  msig_gsea <- GSEA(
    geneList   = gene_list2,
    TERM2GENE  = C2_t2g,
    pvalueCutoff = 0.05
  )

  if (!is.null(msig_gsea) && nrow(msig_gsea@result) > 0) {

    write.table(msig_gsea@result,
                file = paste0(fin_name, "_MSigDB_GSEA_Results.txt"),
                sep = "\t", row.names = FALSE, quote = FALSE)

    png(paste0(fin_name, "_MSigDB_GSEA_Plot.png"), width = 1000, height = 800)
    print(dotplot(msig_gsea, showCategory = 20, title = "MSigDB GSEA"))
    dev.off()

    return(dotplot(msig_gsea, showCategory = 20, title = "MSigDB GSEA"))

  } else {
    cat("⚠️ No significant MSigDB GSEA pathways found.\n")
    return(NULL)
  }

}, error = function(e) {
  cat("❌ Error in MSigDB GSEA:", conditionMessage(e), "\n")
  return(NULL)
})


# === Display plots ===
options(repr.plot.width = 20, repr.plot.height = 20)

if (!is.null(result) && !is.null(result2)) {
  options(repr.plot.width = 12, repr.plot.height = 6)
  plot_grid(result, result2, labels = c("A", "B"), ncol = 2, rel_widths = c(1, 1))

} else if (!is.null(result)) {
  options(repr.plot.width = 7, repr.plot.height = 6)
  print(result)

} else if (!is.null(result2)) {
  options(repr.plot.width = 7, repr.plot.height = 6)
  print(result2)

} else {
  cat("⚠️ No MSigDB enrichment plots to display.\n")
}
Warning message:
"The `category` argument of `msigdbr()` is deprecated as of msigdbr 9.0.0.
ℹ Please use the `collection` argument instead."
using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).


preparing geneSet collections...

GSEA analysis...

no term enriched under specific pvalueCutoff...

⚠️ No significant MSigDB GSEA pathways found.
NULL
⚠️ No MSigDB enrichment plots to display.
No description has been provided for this image
In [53]:
library(msigdbr)
library(clusterProfiler)
library(enrichplot)
library(cowplot)
library(dplyr)

# === Prepare MSigDB C2 gene sets ===
msig_genesets <- msigdbr(species = "Homo sapiens", category = "C5")
C2_t2g <- msig_genesets %>% dplyr::select(gs_name, entrez_gene)

# Use gene_ids$ENTREZID for ORA, and named gene_list2 for GSEA
gene_list <- gene_ids$ENTREZID  # for ORA

# === MSigDB Over-Representation Analysis (ORA) ===
result <- tryCatch({

  msig_enrich <- enricher(
    gene       = gene_list,
    TERM2GENE  = C2_t2g
  )

  if (!is.null(msig_enrich) && nrow(msig_enrich@result) > 0) {

    write.table(msig_enrich@result,
                file = paste0(fin_name, "_MSigDB_OverRepresentation_Results.txt"),
                sep = "\t", row.names = FALSE, quote = FALSE)

    png(paste0(fin_name, "_MSigDB_ORA_Plot.png"), width = 1000, height = 800)
    print(dotplot(msig_enrich, showCategory = 20, title = "MSigDB ORA"))
    dev.off()

    return(dotplot(msig_enrich, showCategory = 20, title = "MSigDB ORA"))

  } else {
    cat("⚠️ No significant MSigDB pathways found in ORA.\n")
    return(NULL)
  }

}, error = function(e) {
  cat("❌ Error in MSigDB ORA:", conditionMessage(e), "\n")
  return(NULL)
})


# === MSigDB Gene Set Enrichment Analysis (GSEA) ===
result2 <- tryCatch({

  msig_gsea <- GSEA(
    geneList   = gene_list2,
    TERM2GENE  = C2_t2g,
    pvalueCutoff = 0.05
  )

  if (!is.null(msig_gsea) && nrow(msig_gsea@result) > 0) {

    write.table(msig_gsea@result,
                file = paste0(fin_name, "_MSigDB_GSEA_Results.txt"),
                sep = "\t", row.names = FALSE, quote = FALSE)

    png(paste0(fin_name, "_MSigDB_GSEA_Plot.png"), width = 1000, height = 800)
    print(dotplot(msig_gsea, showCategory = 20, title = "MSigDB GSEA"))
    dev.off()

    return(dotplot(msig_gsea, showCategory = 20, title = "MSigDB GSEA"))

  } else {
    cat("⚠️ No significant MSigDB GSEA pathways found.\n")
    return(NULL)
  }

}, error = function(e) {
  cat("❌ Error in MSigDB GSEA:", conditionMessage(e), "\n")
  return(NULL)
})


# === Display plots ===
options(repr.plot.width = 10, repr.plot.height = 10)

if (!is.null(result) && !is.null(result2)) {
  options(repr.plot.width = 12, repr.plot.height = 6)
  plot_grid(result, result2, labels = c("A", "B"), ncol = 2, rel_widths = c(1, 1))

} else if (!is.null(result)) {
  options(repr.plot.width = 7, repr.plot.height = 6)
  print(result)

} else if (!is.null(result2)) {
  options(repr.plot.width = 7, repr.plot.height = 6)
  print(result2)

} else {
  cat("⚠️ No MSigDB enrichment plots to display.\n")
}
using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).


preparing geneSet collections...

GSEA analysis...

leading edge analysis...

done...

No description has been provided for this image
⚠️ No MSigDB enrichment plots to display.
No description has been provided for this image
In [ ]: